#install.packages("R.matlab")
#library(R.matlab)
library(ncdf4)
library(ggplot2)
require(data.table)
library(dplyr)
library(plyr)
library(ggpubr) 
#require(tidyverse) #for easy data manipulation and visualization
#require(caret) #for easy machine learning workflow
#library(mgcv) #GAM

wd1 <- "C:\\Users\\YFan\\OneDrive - NORCE\\NFood_replication\\"

crops = c("Corn", "Wheat", "Soybean", "Cotton", "Rice", "Sugarcane", "Tropical Corn", "Tropical Soybean")


#====================Final Regression on each individual crop grid cell, using climate and yield change data across 2020-2099
##(here do regression for each grid cell at a time and then use Monte Carlo resampling to estimate the probability distribution of coefficients and effects)
#Do not merge temperate and tropical corn or soybean at this step, otherwise some grid samples are duplicated for corn or soybean due to coexistance of temperate/tropical cultivars
#otherwise, model prediction with ldply(,predict) will fail due to different number of instance per sample
#Read input data prepared by "Data2.Input-for-MLR-analysis.R"
dif.8crop.s <- readRDS(file = paste0(wd1,"/data/RData/climate-yield-changedata-MLR-selected.Rds"))

#======build MLR regression for each grid cell for crop type, using scenario differencing data across 2020-2099 
#levs <- c("Corn", "Sugarcane", "Wheat", "Rice", "Soybean", "Cotton")
(scenario<-unique(dif.8crop.s$Scenario))
(crops <- unique(dif.8crop.s$Crop))

library(plyr)
models.irrig <- dif.8crop.s[Irrig==TRUE]  %>% 
  mutate(Crop = factor(Crop, levels=crops)) %>%
  mutate(Scenario = factor(Scenario, levels=scenario)) %>%
  dlply(c("Scenario", "Crop", "Sample"), function(df)lm(yield.dif ~ tsa.dif+radd.dif+radi.dif, data = df, na.action = na.omit))
models.rainfed <- dif.8crop.s[Irrig==FALSE]  %>% 
  mutate(Crop = factor(Crop, levels=crops)) %>%
  mutate(Scenario = factor(Scenario, levels=scenario)) %>%
  dlply(c("Scenario", "Crop", "Sample"), function(df)lm(yield.dif ~ tsa.dif+radd.dif+radi.dif+ppt.dif+rh.dif, data = df, na.action = na.omit))



#=====check multicollinearity between independent variables
#A general guideline is that a VIF larger than 5 or 10 is large, indicating that the model has problems estimating the coefficient. However, this in general does not degrade the quality of predictions. 
#If the VIF is larger than 1/(1-R2), where R2 is the Multiple R-squared of the regression, then that predictor is more related to the other predictors than it is to the response.(see R guide on VIF)

#The smallest possible value of VIF is one (absence of multicollinearity). As a rule of thumb, a VIF value that exceeds 5 or 10 indicates a problematic amount of collinearity (James et al. 2014).
#When faced to multicollinearity, the concerned variables should be removed, since the presence of multicollinearity 
#implies that the information that this variable provides about the response is redundant in the presence of the other variables (James et al. 2014,P. Bruce and Bruce (2017)).
#see http://www.sthda.com/english/articles/39-regression-model-diagnostics/160-multicollinearity-essentials-and-vif-in-r/

library(car)
cor.i <- ldply(models.irrig, vif)   #Variance Inflation Factor (VIF)
cor.r <- ldply(models.rainfed, vif) #collinearity is small < 2
cor.i <- as.data.table(cor.i); 
cor.i[, lapply(.SD, mean, na.rm=TRUE), by =.(Scenario,Crop)] #mean VIFs of all predictors are <5 for irrigated crops 
cor.r <- as.data.table(cor.r)
cor.r[, lapply(.SD, mean, na.rm=TRUE), by =.(Scenario,Crop)] #mean VIFs of radi and rh are >5 and <10 for rainfed crops


#obtain regression coefficients
coeff.i.samples <- as.data.table(ldply(models.irrig, coef)) 
coeff.r.samples <- as.data.table(ldply(models.rainfed, coef))
names(coeff.i.samples)[-(1:3)] <- c("Intercept", "T", "RD", "RI")
names(coeff.r.samples)[-(1:3)] <- c("Intercept", "T", "RD", "RI", "P", "RH") #P,RH for moisture/water availability
coeff.i.samples$Level="Mean"
coeff.r.samples$Level="Mean"
#NOTE: use Monte Carlo/Bootstrap resampling to estimate the uncertainty of projection and coefficients for a given region or globally 
# #(do not use) mean coeffs of all samples (averaging log effects will exaggerate the variability of final global net effects in % relative to CNTL yield) 
# #the following mean coeffs are just one instance of the Monte Carlo resample mean
coeff.i.samples.m <- coeff.i.samples[,.(Intercept=mean(Intercept), T=mean(T), RD=mean(RD), RI=mean(RI)), by=.(Scenario,Crop)]
coeff.r.samples.m <- coeff.r.samples[,.(Intercept=mean(Intercept), T=mean(T), RD=mean(RD), RI=mean(RI), P=mean(P), RH=mean(RH) ), by=.(Scenario,Crop)]
coeff.i.samples.m[Scenario=="SAI - RCP85", RI/RD, by=.(Scenario,Crop)]
coeff.r.samples.m[Scenario=="SAI - RCP85", RI/RD, by=.(Scenario,Crop)]
coeff.i.samples.m[Scenario=="SAI - RCP85", mean(RI/RD), by=.(Scenario)] #average 1.8 for irrigated
coeff.r.samples.m[Scenario=="SAI - RCP85", mean(RI/RD), by=.(Scenario)] #average 1.18
#all crops get good diffuse:direct effect ratio, from 0.54 of wheat to 1.76 of corn, average 1.19, very similar to RI<1.2RD (section III.5) Proctor et al


#====select only the sample coefficients with all VIFs < 5 or <10; 
names(cor.i)[-(1:3)] <- c("VIF.T", "VIF.RD", "VIF.RI")
names(cor.r)[-(1:3)] <- c("VIF.T", "VIF.RD", "VIF.RI", "VIF.P", "VIF.RH") 
cor.i.s <- cor.i[VIF.T<10 & VIF.RD<10 & VIF.RI<10] #only use those samples with VIF <10 for all predictors 
length(unique(cor.i.s$Sample))-length(unique(cor.i$Sample))
(length(unique(cor.i.s$Sample))-length(unique(cor.i$Sample)))/length(unique(cor.i$Sample))*100 
#only 8 (0.24%) with any VIF >10; 240 (8%) samples removed when any variable has VIF >5;
cor.r.s <- cor.r[VIF.T<10 & VIF.RD<10 & VIF.RI<10 & VIF.P<10 & VIF.RH<10] #only use those samples with VIF <10 for all predictors 
length(unique(cor.r.s$Sample))-length(unique(cor.r$Sample))
(length(unique(cor.r.s$Sample))-length(unique(cor.r$Sample)))/length(unique(cor.r$Sample))*100 
#247 (4%) unique grids removed when any variable has VIF >10 for rainfed crops


#majority of sampels have good regression coefficients (>95%)
coeff.i.samples.vif <- coeff.i.samples[cor.i.s, on=.(Scenario,Crop,Sample)] 
coeff.r.samples.vif <- coeff.r.samples[cor.r.s, on=.(Scenario,Crop,Sample)]                                 

cor.i.s$Irrig=TRUE; cor.r.s$Irrig=FALSE
dif.8crop.vif <- rbind(dif.8crop.s[cor.i.s, on=.(Scenario,Crop,Irrig,Sample)],
                       dif.8crop.s[cor.r.s, on=.(Scenario,Crop,Irrig,Sample)], fill=TRUE)
unique(dif.8crop.vif$Year) 
dif.8crop.vif <- dif.8crop.vif[order(Year)] #reorder the data by year
#97.9% of crop grid samples have VIF <10 for all independent variables, only 139 (2%) have VIF>10
length(unique(dif.8crop.vif$Sample))/length(unique(dif.8crop.s$Sample))*100 #97.9%


save(list = c("dif.8crop.s", "coeff.i.samples", "coeff.r.samples"), 
     file = paste0(wd1,"/data/RData/MLR-pergridregression.Rdata")) 
save(list = c("cor.i", "cor.r", "cor.i.s", "cor.r.s", "dif.8crop.vif", "coeff.i.samples.vif", "coeff.r.samples.vif"), 
     file = paste0(wd1,"/data/RData/MLR-pergridregression-VIF.Rdata")) 

rm(models.irrig, models.rainfed)
#rm(dif.8crop,dif.8crop.s,dif.8crop.ma)


#========finally resample partial and total effects predicted by MLR===========
#See "Data3.MLR-log-bootstrap-resampling.R" for resampled MLR predictions of partial and total effects for each crop type and each scenario



# #=======bootstrap model coefficients and errors (not used)========
# #The bootstrap approach does not rely on any of these assumptions made by the linear model, and so it is likely giving a more accurate estimate of the coefficients standard errors than is the summary() function.
# #it is common to estimate the percentile confidence interval of coefficients and predictions using Bootstrap resmpling
# #Bootstrap Method is a resampling method that is commonly used in Data Science. It has been introduced by Bradley Efron in 1979.
# #Ref: Davison, A.C. and Hinkley, D.V., 1997. Bootstrap methods and their application (No. 1). Cambridge university press.
# 
# library(dplyr)
# library(rsample)
# library(broom)
# library(purrr) #use purrr::map to apply a function to all the bootstrap samples at once
# 
# 
# #==define helper function to obtain crop weighted average climate effect per year using per-grid regression
# bs.coeff <- function(split) {
#   #=======resample irrig/rainfed crops together
#   sample.s <- analysis(split)  #boots sample from one scenario within each crop type based on sample.data
#   
#   #==per-grid MLR coefficients using the sample ID from above
#   coeff.s <- data[sample.s[,c("Scenario","Crop","Sample")], on=.(Crop,Sample)] #use the same sample indices of each crop for all scenarios
#   coeff.m <- coeff.s[,.(Intercept=mean(Intercept), T=mean(T), RD=mean(RD), RI=mean(RI), P=mean(P), RH=mean(RH) ), by=.(Scenario,Crop)]
#   #no need to weight by crop area for the coefficients, because the per-grid MLRs place no weight on each grid 
#   return(coeff.m)
# }
# 
# coeff.i.samples[,P:=0]; coeff.i.samples[,RH:=0]
# 
# set.seed(27)
# sample.data = coeff.i.samples[Scenario==scenario[1]]
# boots.i <- bootstraps(sample.data, times = 1000, strata=Crop) #stratified sampling per crop to obtain sample IDs
# sample.data = coeff.r.samples[Scenario==scenario[1]]
# boots.r <- bootstraps(sample.data, times = 1000, strata=Crop) #stratified sampling per crop type
# 
# #===bootstrapping process (takes ~100mins for 1000 times)
# data <- coeff.i.samples #external input (log) to function bs.wtm; this is the real data to bootstrap
# boot_stats_i <- boots.i %>%
#   mutate(stats = map(splits, bs.coeff)) #splits is just for getting sample ID of one scenario based on sample.data
# #stats_tidy = map(stats, tidy))
# data <- coeff.r.samples #external input (log) to function bs.wtm
# boot_stats_r <- boots.r %>%
#   mutate(stats = map(splits, bs.coeff)) #eff.r.samples is the real data to bs.wtm
# 
# #===get 95% confidence interval
# alpha <- .05 
# coeff.i.low <- boot_stats_i %>%
#   unnest(stats) %>%
#   dplyr::group_by(Scenario, Crop) %>%
#   dplyr::summarise_at(c("Intercept", "T","RD","RI","P","RH"), quantile, probs = alpha/2, na.rm = TRUE)
# coeff.i.high <- boot_stats_i %>%
#   unnest(stats) %>%
#   dplyr::group_by(Scenario, Crop) %>%
#   dplyr::summarise_at(c("Intercept", "T","RD","RI","P","RH"), quantile, probs = 1 - alpha/2, na.rm = TRUE)
# coeff.i.low$Level="Conf.L"
# coeff.i.high$Level="Conf.U"
# coeff.i.ci <- as.data.table(rbind(coeff.i.low, coeff.i.high))%>%
#   mutate(Scenario = factor(Scenario, levels=scenario)) %>%
#   setorder(Scenario)
# 
# coeff.r.low <- boot_stats_r %>%
#   unnest(stats) %>%
#   dplyr::group_by(Scenario, Crop) %>%
#   dplyr::summarise_at(c("Intercept", "T","RD","RI","P","RH"), quantile, probs = alpha/2, na.rm = TRUE)
# coeff.r.high <- boot_stats_r %>%
#   unnest(stats) %>%
#   dplyr::group_by(Scenario, Crop) %>%
#   dplyr::summarise_at(c("Intercept", "T","RD","RI","P","RH"), quantile, probs = 1 - alpha/2, na.rm = TRUE)
# coeff.r.low$Level="Conf.L"
# coeff.r.high$Level="Conf.U"
# coeff.r.ci <- as.data.table(rbind(coeff.r.low, coeff.r.high))%>%
#   mutate(Scenario = factor(Scenario, levels=scenario)) %>%
#   setorder(Scenario)  
# 
# coeff.i.ci$Irrig=TRUE
# coeff.r.ci$Irrig=FALSE
# coeff.ci.boots <- rbind(coeff.i.ci, coeff.r.ci)
# coeff.ci.boots <- as.data.table(coeff.ci.boots)
# 
# coeff.i.samples.m$Irrig=TRUE
# coeff.r.samples.m$Irrig=FALSE
# coeff.samples.m <- rbind(coeff.i.samples.m, coeff.r.samples.m, fill=TRUE)
# coeff.samples.m$Level <- "Mean"
# coeff.ci.boots <- rbind(coeff.samples.m, coeff.ci.boots, fill=TRUE)
# 
# save(list=c("coeff.ci.boots"), file = paste0(wd1,"/data/RData/MLR-pergrid-coeffs-Bootstrap-1000.Rdata"))











